/*

Copyright (C) 1997,1998,1999,2000,2001  Franz Josef Och

mkcls - a program for making word classes .

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
USA.

*/




#include "KategProblemTest.h"

#include "ProblemTest.h"
#include "HCOptimization.h"
#include "TAOptimization.h"
#include "RRTOptimization.h"
#include "GDAOptimization.h"

#include <cstdlib>
#include <cstdio>
#include <string>
#include <strstream>

typedef pair<string,string> PSS;

#define NEW_SENTENCE_END "mkcls-mapped-dollar-symbol-$"

#ifdef NeXT
char *strdup(char *a)
{
  char *p = (char *)malloc(strlen(a)+1);
  strcpy(p,a);
  return p;
}

#endif


void writeClasses(Array<Kategory> &katOfWord,KategProblem &problem,ostream &to)
{
  for(int i=0; i<katOfWord.size(); i++) {
    if( strcmp(problem.getString(i),"$") )
      if( strcmp(problem.getString(i),"mkcls-mapped-dollar-symbol-$")==0 )
        to << "$" << "\t" << katOfWord[i] << endl;
      else
        to << problem.getString(i) << "\t" << katOfWord[i] << endl;
  }
}


void mysplit(const string &s,string &s1,string &s2)
{
  unsigned int i=0;
  for(; i<s.length(); i++)if( s[i]==' ' || s[i]=='\t' || s[i]==' ')break;
  s1=s.substr(0,i);
  for(; i<s.length(); i++)if( !(s[i]==' ' || s[i]=='\t' || s[i]==' ') )break;
  s2=s.substr(i,s.length()-i);

  iassert(s1.size());
  iassert(s2.size());
}



int fromCatFile(KategProblem *p,const char *fname,bool verb)
{
  leda_h_array<string,int> translation(-1);
  int maxCat=2;
  ifstream in(fname);
  if(!in) {
    cerr << "Error: File '" << fname << "' cannot be opened.\n";
    exit(1);
  }
  for(int i=0; i<p->wordFreq.nWords; i++)
    (p->initLike)[i]= -1;


  translation["1"]=1;
  translation["0"]=0;


  string s;
  while( getline(in,s) ) {
    string str,categ;
    mysplit(s,str,categ);
    int i=p->words->binary_locate(str);
    if(i>=0 && (*(p->words))[i]==str ) {

      if( translation[categ]==-1 )
        translation[categ]=maxCat++;
      int cat=translation[categ];
      if( (p->initLike)[i]!= -1 )
        cerr << "Warning: Word '" << ((*(p->words))[i])<< "' is already in a category.\n";
      (p->initLike)[i]=cat;
    } else
      cerr << "Warning: Word '" << str << "' " << i << " is not in training corpus.\n";
  }

  if( verboseMode )
    cout << "We have " << maxCat << " read non-empty categories"
         " (with words from the corpus).\n";

  if(maxCat>p->katFreq.nKats) {
    cerr << "Error: Not enough categories reserved (only "
         << p->katFreq.nKats << ", but i need " << maxCat << ").\n";
    exit(1);
  }


  int i=p->words->binary_locate("$");
  if( i>=0 &&  (*(p->words))[i]=="$" )
    (p->initLike)[i]=0;
  else if( verboseMode )
    cerr << "Warning: No '$' in vocabulary!\n";


  int errors=0;
  for(i=0; i<p->wordFreq.nWords; i++)
    if((p->initLike)[i]== -1 ) {
      if( verb ) cerr << "Error: I don't know the category of word " << i
                        << " (" << (*(p->words))[i] << ") " << ".\n";
      errors=1;
    }
  return errors;
}



KategProblem *makeKategProblem(const leda_h_array<PSS,FreqType>&cTbl,const leda_set<string>&setVokabular, int maxClass,int initialisierung,
                               int auswertung,int nachbarschaft,int minWordFrequency)
{

  int nwrd=0;
  leda_array<string>&sVok = *new leda_array<string>(setVokabular.size());
  string s;
  unsigned int ctr=0;
  forall_set(leda_set<string>,s,setVokabular) {
    if( verboseMode>2 )
      cout << "mkcls:Wort " << ctr << " " << s << endl;
    sVok[ctr++]=s;
  }
  for(unsigned int z=0; z<ctr-1; z++)
    iassert( sVok[z]<sVok[z+1] );
  sVok.sort();

  if( verboseMode>2 )
    cout << "*****Vocabulary: " << sVok;

  unsigned int vokSize=sVok.size();
  massert(vokSize==ctr);
  massert(vokSize==setVokabular.size());
  if(verboseMode) {
    cout << "Size of vocabulary: " << vokSize << "\n";
    cout.flush();
  }

  KategProblem *k = new KategProblem(vokSize,maxClass,initialisierung,
                                     auswertung,nachbarschaft,minWordFrequency);
  KategProblemWBC &w=k->wordFreq;
  k->words=&sVok;

  Array<int> after(vokSize,0);
  Array<int> before(vokSize,0);


  nwrd=0;
  {
    PSS s;
    forall_defined_h2(PSS,FreqType,s,cTbl) {
      const string&ss1=s.first;
      const string&ss2=s.second;
      if( ss2.length()&&(ss1!="$" || ss2!="$") ) {
        int i1=sVok.binary_search(ss1);
        int i2=sVok.binary_search(ss2);
        iassert( sVok[i1] == ss1 );
        iassert( sVok[i2] == ss2 );
        after[i1]++;
        before[i2]++;
      }
      if( verboseMode&&((nwrd++)%10000==0) ) {
        cout<<"Statistiken-1 " << nwrd<< ".      \r";
        cout.flush();
      }
    }
  }

  for(unsigned int i=0; i<vokSize; i++) {
    w.setAfterWords(i,after[i]);
    w.setBeforeWords(i,before[i]);
  }


  {
    nwrd=0;
    PSS s;
    forall_defined_h2(PSS,FreqType,s,cTbl) {
      const string&ss1=s.first;
      const string&ss2=s.second;
      FreqType p=cTbl[s];
      if( ss2.length()&&(ss1!="$" || ss2!="$") ) {
        int i1=sVok.binary_search(ss1);
        int i2=sVok.binary_search(ss2);
        iassert( sVok[i1] == ss1 );
        iassert( sVok[i2] == ss2 );
        w.setFreq(i1,i2,p);
        if( verboseMode>2 )
          cout << "BIGRAMM-HAEUF: " << ss1 << ":" << i1 << "   "
               << ss2 << ":" << i2 << "   " << p << endl;
      }
      if( verboseMode&&((nwrd++)%10000==0) ) {
        cout<<"Statistiken-2 " <<nwrd<< ".    \r";
        cout.flush();
      }
    }
  }

  w.testFull();
  if(verboseMode) {
    cout << "Datenintegritaet getestet.\n";
    cout.flush();
  }
  return k;
}

KategProblem *fromNgrFile(const char *str,int maxClass,int initialisierung,
                          int auswertung,int nachbarschaft,int minWordFrequency)
{
  ifstream file(str);
  if(!file)return 0;
  leda_set<string> setVokabular;
  leda_h_array<PSS,FreqType> cTbl;
  double c=0;
  if( verboseMode )cout << "NGRFILE: " << str << endl;
  string s1,s2;
  while(file >> c >> s1 >> s2) {
    if( s1.length()==0||s2.length()==0 ) {
      cerr << "ERROR: strings are zero: " << s1.length() <<" " << s1 <<" " << s2.length()<<" " << s2 << endl;
      return 0;
    }
    if( c==0 ) {
      cerr << "Count ist 0 " << s1 << " " << s2 << endl;
      return 0;
    }
    cTbl[pair<string,string>(s1,s2)]=(FreqType)c;
    setVokabular.insert(s1);
    setVokabular.insert(s2);
    if( verboseMode>1 )
      cout << "R: " << s1 << " " << s2 << " " << c << endl;
    c=0;
  }

  return makeKategProblem(cTbl,setVokabular,maxClass,initialisierung,auswertung,nachbarschaft,minWordFrequency);
}








KategProblem *fromKModel(const char *str,int maxClass,int initialisierung,
                         int auswertung,int nachbarschaft,int minWordFrequency)
{
  string oldText,text,line;
  ifstream f(str);
  if( !f ) {
    cerr << "ERROR: can not open file " << str << ".\n";
    return 0;
  }

  leda_set<string> setVokabular;
  leda_h_array<PSS,FreqType> cTbl(0);
  oldText="$";
  while(1) {
    getline(f,line);
    if(f.fail() && !f.bad() && !f.eof()) {
      cerr << "WARNING: strange characters in stream (getline) " << endl;
      f.clear();
    }
    if(!f)break;

    istrstream f2(line.c_str());
    while( 1 ) {
      f2 >> text;
      if(f2.fail() && !f2.bad() && !f2.eof()) {
        cerr << "WARNING: strange characters in stream (>>) !\n";
        f2.clear(ios::failbit);
      }
      if(!f2) {
        break;
      }






      if( text == "$" )
        text = "mkcls-mapped-dollar-symbol-$";
      if( !setVokabular.member(text) )setVokabular.insert(text);
      cTbl[pair<string,string>(oldText,text)]++;
      oldText=text;
    }
    text="$";
    if( !setVokabular.member(text) )setVokabular.insert(text);
    cTbl[pair<string,string>(oldText,text)]++;
    oldText=text;
  }
  return makeKategProblem(cTbl,setVokabular,maxClass,initialisierung,auswertung,nachbarschaft,minWordFrequency);
}





void KategProblemSetParameters(KategProblem &p)
{
  if( p.katwahl()==K_BEST ) {
    TAOptimization::defaultAnnRate=0.7;
    RRTOptimization::defaultAnnRate=0.95;
    GDAOptimization::defaultAlpha=0.05;
    if( verboseMode )
      cout << "Parameter-setting like W-DET-BEST\n";
  } else {
    TAOptimization::defaultAnnRate=0.4;
    RRTOptimization::defaultAnnRate=0.6;
    GDAOptimization::defaultAlpha=0.0125;
    if( verboseMode )
      cout << "Parameter-setting like W-DET-DET\n";
  }
}




KategProblem &makRandom(int ANZ_WORD,int ANZ_CLS,int initValue,
                        int auswertung,int nachbarschaft,float relInit)
{
  KategProblem &k=
    *new KategProblem(ANZ_WORD,ANZ_CLS,initValue,auswertung,nachbarschaft);
  KategProblemWBC &w=k.wordFreq;
  Array<int> after(ANZ_WORD,0);
  Array<int> before(ANZ_WORD,0);
  Array<FreqArray> twoD(ANZ_WORD);
  int i;
  for(i=0; i<ANZ_WORD; i++) twoD[i].init(ANZ_WORD,0);

  for(i=0; i<ANZ_WORD; i++) {
    massert(after[i]==0);
    massert(before[i]==0);
    for(int j=0; j<ANZ_WORD; j++) {
      massert(twoD[i][j]==0);
    }
  }
  for(i=0; i<ANZ_WORD*ANZ_WORD*relInit; i++) {
    int x=randomInt(ANZ_WORD);
    int y=randomInt(ANZ_WORD);
    if(twoD[x][y]==0) {
      after[x]++;
      before[y]++;
    }
    twoD[x][y]+=randomInt(10)+1;
  }
  for(i=0; i<ANZ_WORD; i++) {
    w.setAfterWords(i,after[i]);
    w.setBeforeWords(i,before[i]);
  }

  for(i=0; i<ANZ_WORD; i++) {
    for(int j=0; j<ANZ_WORD; j++)
      if( twoD[i][j] )
        w.setFreq(i,j,twoD[i][j]);
  }
  w.testFull();
  return k;
}




char *makeTitle(KategProblem &problem,int verfahren)
{
  char x[1024];
  switch(verfahren) {
  case HC_OPT:
    strcpy(x,"HC   ");
    break;
  case SA_OPT:
    strcpy(x,"SA     ");
    break;
  case TA_OPT:
    strcpy(x,"TA     ");
    break;
  case GDA_OPT:
    strcpy(x,"GDA    ");
    break;
  case RRT_OPT:
    strcpy(x,"RRT    ");
    break;
  }
  problem.makeTitle(x+strlen(x));
  return strdup(x);
}




#define MAX_MULTIPLE 10

Array<KategProblem *> &_izrOptimization(Array<KategProblem *> &probs,
                                        int anzprob,double timeForOneRed,double maxClock,Array<Kategory> &katOfWord,
                                        int anzIter,int verfahren)
{
  massert(anzprob>1);
  massert(probs[0]->wordFreq.mindestAnzahl<=1);
  KategProblem *p0=probs[0];

  int nWords=p0->wordFreq.nWords;
  int nKats=p0->katFreq.nKats;
  int minimumNumberOfWords = max(1,int(nWords*0.95));

  int indexOfDurchschnitt;
  Array<int> newWords(nWords);
  int useAnzprob=anzprob;
  do {
    int w,k;
    indexOfDurchschnitt=0;
    for(w=0; w<nWords; w++)
      newWords[w]=-1;
    for(k=0; k<useAnzprob; k++) {
      massert(probs[k]->wordFreq.nWords==nWords);
      probs[k]->makeKats();
    }

    for(w=0; w<nWords; w++) {
      if( newWords[w]==-1 ) {



        leda_set<int> durchschnitt=(*p0->kats)[p0->katOfWord(w)];
        for(k=1; k<useAnzprob; k++)
          durchschnitt = durchschnitt & (*probs[k]->kats)[probs[k]->katOfWord(w)];


        int _anzInDurchschnitt=0;
        int nr=0;
        forall_set(leda_set<int>,nr,durchschnitt) {
          _anzInDurchschnitt++;
          newWords[nr]=indexOfDurchschnitt;
        }
        if( verboseMode && _anzInDurchschnitt>1 && anzIter==0 ) {
          cout << "- (";
          forall_set(leda_set<int>,nr,durchschnitt) {
            cout << p0->getString(nr);
            if( p0->wordFreq.n1(nr)==1 )
              cout << "* ";
            else
              cout << " ";
          }
          cout << ")\n";
        }




        for(k=0; k<useAnzprob; k++) {
          durchschnitt = durchschnitt - (*probs[k]->kats)[probs[k]->katOfWord(w)];
        }
        indexOfDurchschnitt++;
      }
    }

    if(indexOfDurchschnitt>=minimumNumberOfWords) {
      if(useAnzprob==1) {
        cout << "useAnzProb==1 => mysterious.\n";
        break;
      }
      useAnzprob--;
    }
  } while(indexOfDurchschnitt>=minimumNumberOfWords);


  Array<KategProblem *> &neu=*new Array<KategProblem *>(MAX_MULTIPLE*anzprob,(KategProblem *)0);
  qsort(probs.getPointerToData(),useAnzprob,sizeof(KategProblem *),compareProblem);
  massert(useAnzprob<=probs.size());
  double startTime=clockSec();
  int i, numberOfNew;
  for(numberOfNew=0; (clockSec()-startTime<timeForOneRed)
      || (numberOfNew < anzprob) ; numberOfNew++) {
    int w;
    if( numberOfNew==anzprob*MAX_MULTIPLE-1 )
      break;
    KategProblem *p
    = neu[numberOfNew]
      = new KategProblem(indexOfDurchschnitt,nKats-2,
                         p0->initialisierung,p0->auswertung,p0->nachbarschaft);

    for(w=0; w<indexOfDurchschnitt; w++) {
      p->wordFreq.setAfterWords(w,5);
      p->wordFreq.setBeforeWords(w,5);
    }
    for(w=0; w<nWords; w++) {
      Array<OneFreq> &after=p0->wordFreq.after[w];
      int size=after.size();
      for(i=0; i<size; i++)
        p->wordFreq.addFreq(newWords[w],newWords[after[i].w],after[i].n);
    }
    p->wordFreq.testFull(1);






    p->wordFreq.set_h_of_words(p0->wordFreq.get_h_of_words());
    double w1=0.0,w2=0.0;
    if(numberOfNew<useAnzprob) {

      for(i=0; i<nWords; i++)
        (p->initLike)[newWords[i]]=probs[numberOfNew]->katOfWord(i);
      p->_initialize(5);
      HCOptimization hc(*p,-1);
      if(verboseMode) {
        w1=p->nicevalue();
        cout << "from old category system:" << w1 << endl;
      }
      hc.minimize(-1);
      if(verboseMode) {
        w2=p->nicevalue();
        if(w2<w1)
          cout << "improvement: " << w1-w2 << endl;
      }
    } else {
      p->_initialize(1);
      double mean;
      StatVar end,laufzeit,start;
      solveProblem(0,*p,1,-1,verfahren,mean,end,laufzeit,start);
      w2=p->value();
      if(verboseMode)
        cout << "new category system: " << w2 << " (" << p->nicevalue()
             << ") Zeit: " << clockSec() << "\n";
    }
  }
  int p;
  for(p=0; p<probs.size(); p++) {
    if( probs[p] )
      delete probs[p];
  }
  qsort(neu.getPointerToData(),numberOfNew,sizeof(Problem *),compareProblem);
  massert(numberOfNew<=neu.size());
  if( verboseMode )
    cout << "Iterierte Zustandsraum-Reduktion: " << indexOfDurchschnitt
         << " words. costs: " << neu[0]->value() << " "
         << neu[0]->nicevalue() << " (" << numberOfNew-anzprob << ")" << "time: "
         << clockSec() << endl;
  if( indexOfDurchschnitt<=nKats
      || (clockSec()>maxClock&&maxClock) ) {
    if( clockSec()>maxClock&&maxClock )
      cout << "STOP (time limit: " << (clockSec()-maxClock) << " s)\n";
    for(i=0; i<nWords; i++)
      katOfWord[i]=neu[0]->katOfWord(newWords[i]);
    return neu;
  } else {
    Array<Kategory> &newKatOfWord=
      *(new Array<Kategory>(neu[0]->wordFreq.nWords,-1));
    Array<KategProblem *> &erg=_izrOptimization(neu,anzprob,timeForOneRed,
                               maxClock,newKatOfWord,
                               anzIter+1,verfahren);
    for(i=0; i<nWords; i++)
      katOfWord[i]=newKatOfWord[newWords[i]];
    return erg;
  }
}




KategProblem *izrOptimization(KategProblem &p,int minN,int firstN,
                              double clockForOneRed,double maxClock,int verfahren)
{
  Array<Kategory> katOfWord(p.wordFreq.nWords,-1);
  int startN;
  if( clockForOneRed<=0 )
    startN=firstN;
  else
    startN=1000;
  Array<KategProblem *> probs(startN);
  double val1=0.0,val2=0.0;
  double endTime=-1;

  double startTime=clockSec();
  int i;
  for(i=0; i<startN; i++) {
    StatVar end,laufzeit,start;
    double mean;
    probs[i] = (KategProblem *)((KategProblem *)p.makeEqualProblem());
    solveProblem(0,*(probs[i]),1,-1,verfahren,mean,end,laufzeit,start);
    if( i==minN-1 )
      endTime = clockSec();
    if( i>=firstN-1 && (startTime+clockForOneRed>clockSec() || i==999) )
      break;
  }
  if( endTime<0 )
    endTime=clockSec();
  massert(i>=firstN);

  qsort(probs.getPointerToData(),i,sizeof(KategProblem *),compareProblem);
  massert(i<=probs.size());
  if( clockForOneRed<=0 ) {
    clockForOneRed=endTime-startTime;
    if( verboseMode )
      cout << "time for one reduction: " << clockForOneRed << endl;
  }
  _izrOptimization(probs,minN,clockForOneRed,maxClock,katOfWord,0,verfahren);

  KategProblem *n=(KategProblem *)(p.makeEqualProblem());
  n->initLike= katOfWord;
  n->_initialize(5);
  if( verboseMode )
    val1=n->value();
  HCOptimization hc(*n,-1);
  hc.minimize(-1);
  val2=n->value();
  if( verboseMode )
    cout << "last improvement: " << val2-val1 << "\n";
  cout << "final costs: " << val2 << " " << n->nicevalue() << endl;
  if(PrintBestTo)
    n->dumpOn(*PrintBestTo);
  return n;
}











